Load packages:
library(tidyverse)
library(labelled)Resources:
num_obs <- 10000
# Generate standard normal distribution (default is mean = 0 and sd = 1)
set.seed(124)
stdnorm_dist <- rnorm(num_obs, mean = 0, sd = 1) # equivalent to rnorm(10)
# Generate normal distribution w/ custom mean and sd
set.seed(124)
norm_dist <- rnorm(num_obs, mean = 50, sd = 5)
# Generate left-skewed distribution
set.seed(124)
lskew_dist <- rbeta(num_obs, 5, 2)
# Generate right-skewed distribution
set.seed(124)
rskew_dist <- rbeta(num_obs, 2, 5)
# Create dataframe
df_generated_pop <- data.frame(stdnorm_dist, norm_dist, lskew_dist, rskew_dist)load(file = url('https://github.com/anyone-can-cook/educ152/raw/main/data/ipeds/output_data/panel_data.RData'))
# Create ipeds data frame with fewer variables/observations
df_ipeds_pop <- panel_data %>%
# keep IPEDS tuition data from fall of which year (e.g., fall 2016 is price for programs in 2016-17 academic year)
filter(year == 2016) %>%
# which universities to keep:
# 2015 carnegie classification: keep research universities (15,16,17) and master's universities (18,19,20)
filter(c15basic %in% c(15,16,17,18,19,20)) %>%
# which variables to keep
select(instnm,unitid,opeid6,opeid,control,c15basic,stabbr,city,zip,locale,obereg, # basic institutional characteristics
tuition6,fee6,tuition7,fee7, # avg tuition and fees for full-time grad, in-state and out-of-state
isprof3,ispfee3,osprof3,ospfee3, # avg tuition and fees for MD, in-state and out-of-state
isprof9,ispfee9,osprof9,ospfee9, # avg tuition and fees for Law, in-state and out-of-state
chg4ay3,chg7ay3,chg8ay3) %>% # [undergraduate] books+supplies; off-campus (not with family) room and board; off-campus (not with family) other expenses
# rename variables; syntax <new_name> = <old_name>
rename(region = obereg, # revion
tuit_grad_res = tuition6, fee_grad_res = fee6, tuit_grad_nres = tuition7, fee_grad_nres = fee7, # grad
tuit_md_res = isprof3, fee_md_res = ispfee3, tuit_md_nres = osprof3, fee_md_nres = ospfee3, # md
tuit_law_res = isprof9, fee_law_res = ispfee9, tuit_law_nres = osprof9, fee_law_nres = ospfee9, # law
books_supplies = chg4ay3, roomboard_off = chg7ay3, oth_expense_off = chg8ay3) %>% # [undergraduate] expenses
# create measures of tuition+fees
mutate(
tuitfee_grad_res = tuit_grad_res + fee_grad_res, # graduate, state resident
tuitfee_grad_nres = tuit_grad_nres + fee_grad_nres, # graduate, non-resident
tuitfee_md_res = tuit_md_res + fee_md_res, # MD, state resident
tuitfee_md_nres = tuit_md_nres + fee_md_nres, # MD, non-resident
tuitfee_law_res = tuit_law_res + fee_law_res, # Law, state resident
tuitfee_law_nres = tuit_law_nres + fee_law_nres) %>% # Law, non-resident
# create measures of cost-of-attendance (COA) as the sum of tuition, fees, book, living expenses
mutate(
coa_grad_res = tuit_grad_res + fee_grad_res + books_supplies + roomboard_off + oth_expense_off, # graduate, state resident
coa_grad_nres = tuit_grad_nres + fee_grad_nres + books_supplies + roomboard_off + oth_expense_off, # graduate, non-resident
coa_md_res = tuit_md_res + fee_md_res + books_supplies + roomboard_off + oth_expense_off, # MD, state resident
coa_md_nres = tuit_md_nres + fee_md_nres + books_supplies + roomboard_off + oth_expense_off, # MD, non-resident
coa_law_res = tuit_law_res + fee_law_res + books_supplies + roomboard_off + oth_expense_off, # Law, state resident
coa_law_nres = tuit_law_nres + fee_law_nres + books_supplies + roomboard_off + oth_expense_off) # %>% # Law, non-resident
# [COMMENTED THIS OUT] keep only observations that have non-missing values for the variable coa_grad_res
# this does cause us to lose some interesting universities, but doing this will eliminate some needless complications with respect to learning core concepts about statistical inference
#filter(!is.na(coa_grad_res))
# Add variable labels to the tuit+fees variables and coa variables
# tuition + fees variables
var_label(df_ipeds_pop[['tuitfee_grad_res']]) <- 'graduate, full-time, resident; avg tuition + required fees'
var_label(df_ipeds_pop[['tuitfee_grad_nres']]) <- 'graduate, full-time, non-resident; avg tuition + required fees'
var_label(df_ipeds_pop[['tuitfee_md_res']]) <- 'MD, full-time, state resident; avg tuition + required fees'
var_label(df_ipeds_pop[['tuitfee_md_nres']]) <- 'MD, full-time, non-resident; avg tuition + required fees'
var_label(df_ipeds_pop[['tuitfee_law_res']]) <- 'Law, full-time, state resident; avg tuition + required fees'
var_label(df_ipeds_pop[['tuitfee_law_nres']]) <- 'Law, full-time, non-resident; avg tuition + required fees'
# COA variables
var_label(df_ipeds_pop[['coa_grad_res']]) <- 'graduate, full-time, state resident COA; == tuition + fees + (ug) books/supplies + (ug) off-campus room and board + (ug) off-campus other expenses'
var_label(df_ipeds_pop[['coa_grad_nres']]) <- 'graduate, full-time, non-resident COA; == tuition + fees + (ug) books/supplies + (ug) off-campus room and board + (ug) off-campus other expenses'
var_label(df_ipeds_pop[['coa_md_res']]) <- 'MD, full-time, state resident COA; == tuition + fees + (ug) books/supplies + (ug) off-campus room and board + (ug) off-campus other expenses'
var_label(df_ipeds_pop[['coa_md_nres']]) <- 'MD, full-time, non-resident COA; == tuition + fees + (ug) books/supplies + (ug) off-campus room and board + (ug) off-campus other expenses'
var_label(df_ipeds_pop[['coa_law_res']]) <- 'Law, full-time, state resident COA; == tuition + fees + (ug) books/supplies + (ug) off-campus room and board + (ug) off-campus other expenses'
var_label(df_ipeds_pop[['coa_law_nres']]) <- 'Law, full-time, non-resident COA; == tuition + fees + (ug) books/supplies + (ug) off-campus room and board + (ug) off-campus other expenses'
#df_ipeds_pop %>% glimpse()
load(file = url('https://github.com/anyone-can-cook/educ152/raw/main/data/college_scorecard/output_data/df_debt_earn_panel_labelled.RData'))
df_scorecard <- df_debt_earn_panel_labelled %>%
# keep most recent year of data
filter(field_ay == '2017-18') %>%
# keep master's degrees
filter(credlev == 5) %>%
# carnegie categories to keep: 15 = Doctoral Universities: Very High Research Activity; 16 = Doctoral Universities: High Research Activity
# note: variable ccbasic from scorecard data is 2015 carnegie classification
filter(ccbasic %in% c(15,16,17,18,19,20)) %>%
# drop "parent plus" loan variables and other vars we won't use in this lecture
select(-contains('_pp'),-contains('_any'),-field_ay,-st_fips,-zip,-longitude,-latitude,-locale2,-highdeg,-accredagency,-relaffil,-hbcu,-annhi,-tribal,-aanapii,-hsi,-nanti,-main,-numbranch,-control) %>%
# create variable for broad field of degree (e.g., education, business)
mutate(cipdig2 = str_sub(string = cipcode, start = 1, end = 2)) %>%
# shorten variable cipdesc to make it more suitable for printing
mutate(cipdesc = str_sub(string = cipdesc, start = 1, end = 50)) %>%
# re-order variables
relocate(opeid6,unitid,instnm,ccbasic,stabbr,city,cipdig2)
# For debt and earnings variables, convert from character to numeric variables (which replaces "PrivacySuppressed" values with NA values)
df_scorecard <- df_scorecard %>%
mutate(
debt_all_stgp_eval_n = as.numeric(debt_all_stgp_eval_n),
debt_all_stgp_eval_mean = as.numeric(debt_all_stgp_eval_mean),
debt_all_stgp_eval_mdn = as.numeric(debt_all_stgp_eval_mdn),
debt_all_stgp_eval_mdn10yrpay = as.numeric(debt_all_stgp_eval_mdn10yrpay),
earn_count_wne_hi_1yr = as.numeric(earn_count_wne_hi_1yr),
earn_mdn_hi_1yr = as.numeric(earn_mdn_hi_1yr),
earn_count_wne_hi_2yr = as.numeric(earn_count_wne_hi_2yr),
earn_mdn_hi_2yr = as.numeric(earn_mdn_hi_2yr)
)
# add variable label to variable cipdig2
attr(df_scorecard[['cipdig2']], which = 'label') <- 'broad degree field code = 2-digit classification of instructional programs (CIP) degree code'
# add variable label attribute back to debt and earnings variables
for(v in c('debt_all_stgp_eval_n','debt_all_stgp_eval_mean','debt_all_stgp_eval_mdn','debt_all_stgp_eval_mdn10yrpay','earn_count_wne_hi_1yr','earn_mdn_hi_1yr','earn_count_wne_hi_2yr','earn_mdn_hi_2yr','cipdesc')) {
attr(df_scorecard[[v]], which = 'label') <- attr(df_debt_earn_panel_labelled[[v]], which = 'label')
}
rm(df_debt_earn_panel_labelled) # comment this line out if you want to keep data frame
df_score_ipeds <- df_ipeds_pop %>%
select(-instnm,-opeid6,-opeid,-c15basic,-region,-locale,-city,-stabbr,-zip) %>% mutate(one=1) %>%
right_join(y=df_scorecard, by = 'unitid')
df_score_ipeds <- df_score_ipeds %>%
# drop unitids from scorecard that don't merge to ipeds data (on tuition)
filter(!is.na(one)) %>%
# drop observations that don't have mean debt data
filter(!is.na(debt_all_stgp_eval_mean)) %>%
# drop for-profits
filter(unclass(control) !=3) %>%
# drop tuition/coa vars for law and md
select(-one,-contains('law'),-contains('md_')) %>%
relocate(opeid6,unitid,instnm,control,ccbasic,stabbr,region,city,locale,cipdig2,cipcode,cipdesc,credlev,creddesc,
contains('ipeds'),starts_with('debt'),starts_with('earn'))
# create data frame that only contains observations for MAs in social work; filter(cipcode=='4407')
df_socialwork <- df_score_ipeds %>%
filter(cipcode=='4407') %>%
# remove observations with missing values of cost of attendance (better for teaching concepts)
filter(!is.na(coa_grad_res))load(file = url('https://github.com/anyone-can-cook/educ152/raw/main/data/college_scorecard/output_data/df_debt_earn_mba.Rdata'))df_ipeds_pop %>% head()## # A tibble: 6 x 38
## instnm unitid opeid6 opeid control c15basic stabbr city zip locale region tuit_grad_res fee_grad_res tuit_grad_nres fee_grad_nres tuit_md_res fee_md_res tuit_md_nres fee_md_nres tuit_law_res fee_law_res tuit_law_nres fee_law_nres books_supplies roomboard_off oth_expense_off tuitfee_grad_res tuitfee_grad_nres tuitfee_md_res tuitfee_md_nres tuitfee_law_res tuitfee_law_nres coa_grad_res coa_grad_nres coa_md_res coa_md_nres coa_law_res coa_law_nres
## <chr> <dbl> <chr> <chr> <dbl+lbl> <dbl+lbl> <chr+lbl> <chr> <chr> <dbl+lbl> <dbl+lbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Alabama A & M University 100654 001002 00100200 1 [Public] 18 [Master^s Colleges & Universities: Larger Programs] AL [Alabama] Normal 35762 12 [City: Midsize] 5 [Southeast AL AR FL GA KY LA MS NC SC TN VA WV] 8130 1236 16260 1236 NA NA NA NA NA NA NA NA 1600 8830 3090 9366 17496 NA NA NA NA 22886 31016 NA NA NA NA
## 2 University of Alabama at Birmingham 100663 001052 00105200 1 [Public] 15 [Doctoral Universities: Highest Research Activity] AL [Alabama] Birmingham 35294-0110 12 [City: Midsize] 5 [Southeast AL AR FL GA KY LA MS NC SC TN VA WV] 7588 0 17294 0 26778 0 61848 0 NA NA NA NA 1200 11682 4886 7588 17294 26778 61848 NA NA 25356 35062 44546 79616 NA NA
## 3 Amridge University 100690 025034 02503400 2 [Private not-for-profit] 20 [Master^s Colleges & Universities: Small Programs] AL [Alabama] Montgomery 36117-3553 12 [City: Midsize] 5 [Southeast AL AR FL GA KY LA MS NC SC TN VA WV] 11700 1390 11700 1390 NA NA NA NA NA NA NA NA 1500 9600 1600 13090 13090 NA NA NA NA 25790 25790 NA NA NA NA
## 4 University of Alabama in Huntsville 100706 001055 00105500 1 [Public] 16 [Doctoral Universities: Higher Research Activity] AL [Alabama] Huntsville 35899 12 [City: Midsize] 5 [Southeast AL AR FL GA KY LA MS NC SC TN VA WV] 9834 508 21830 508 NA NA NA NA NA NA NA NA 1688 9603 3578 10342 22338 NA NA NA NA 25211 37207 NA NA NA NA
## 5 Alabama State University 100724 001005 00100500 1 [Public] 19 [Master^s Colleges & Universities: Medium Programs] AL [Alabama] Montgomery 36104-0271 12 [City: Midsize] 5 [Southeast AL AR FL GA KY LA MS NC SC TN VA WV] 6174 2284 12348 2284 NA NA NA NA NA NA NA NA 1600 7320 4228 8458 14632 NA NA NA NA 21606 27780 NA NA NA NA
## 6 The University of Alabama 100751 001051 00105100 1 [Public] 16 [Doctoral Universities: Higher Research Activity] AL [Alabama] Tuscaloosa 35487-0166 13 [City: Small] 5 [Southeast AL AR FL GA KY LA MS NC SC TN VA WV] 10470 0 26950 0 26778 0 61848 0 22760 0 38820 0 1200 13050 4116 10470 26950 26778 61848 22760 38820 28836 45316 45144 80214 41126 57186
df_generated_pop %>% head()## stdnorm_dist norm_dist lskew_dist rskew_dist
## 1 -1.38507062 43.07465 0.9172827 0.08271725
## 2 0.03832318 50.19162 0.7064826 0.29351743
## 3 -0.76303016 46.18485 0.8444117 0.15558827
## 4 0.21230614 51.06153 0.6694614 0.33053865
## 5 1.42553797 57.12769 0.3488487 0.65115131
## 6 0.74447982 53.72240 0.5401472 0.45985283
df_socialwork %>% head()## # A tibble: 6 x 35
## opeid6 unitid instnm control ccbasic stabbr region city locale cipdig2 cipcode cipdesc credlev creddesc ipedscount1 ipedscount2 debt_all_stgp_eval_n debt_all_stgp_eval_mean debt_all_stgp_eval_mdn debt_all_stgp_eval_mdn10yrpay earn_count_wne_hi_1yr earn_mdn_hi_1yr earn_count_wne_hi_2yr earn_mdn_hi_2yr tuit_grad_res fee_grad_res tuit_grad_nres fee_grad_nres books_supplies roomboard_off oth_expense_off tuitfee_grad_res tuitfee_grad_nres coa_grad_res coa_grad_nres
## <chr> <dbl> <chr> <dbl+lbl> <dbl+lbl> <chr> <dbl+lbl> <chr> <dbl+lbl> <chr> <chr> <chr> <dbl+lbl> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 001002 100654 Alabama A & M University 1 [Public] 18 [Master's Colleges & Universities: Larger Programs] AL 5 [Southeast (AL, AR, FL, GA, KY, LA, MS, NC, SC, TN, VA, WV)] Normal 12 [City: Midsize (population of at least 100,000 but less than 250,000)] 44 4407 Social Work. 5 [Master's Degree] Master's Degree 92 79 146 51788 50446 518 138 38065 83 37819 8130 1236 16260 1236 1600 8830 3090 9366 17496 22886 31016
## 2 001005 100724 Alabama State University 1 [Public] 19 [Master's Colleges & Universities: Medium Programs] AL 5 [Southeast (AL, AR, FL, GA, KY, LA, MS, NC, SC, TN, VA, WV)] Montgomery 12 [City: Midsize (population of at least 100,000 but less than 250,000)] 44 4407 Social Work. 5 [Master's Degree] Master's Degree 7 25 NA 38001 41000 421 NA NA NA NA 6174 2284 12348 2284 1600 7320 4228 8458 14632 21606 27780
## 3 001051 100751 The University of Alabama 1 [Public] 15 [Doctoral Universities: Very High Research Activity] AL 5 [Southeast (AL, AR, FL, GA, KY, LA, MS, NC, SC, TN, VA, WV)] Tuscaloosa 12 [City: Midsize (population of at least 100,000 but less than 250,000)] 44 4407 Social Work. 5 [Master's Degree] Master's Degree 189 225 244 32830 30750 316 276 39254 292 38981 10470 0 26950 0 1200 13050 4116 10470 26950 28836 45316
## 4 001036 102049 Samford University 2 [Private not-for-profit] 17 [Doctoral/Professional Universities] AL 5 [Southeast (AL, AR, FL, GA, KY, LA, MS, NC, SC, TN, VA, WV)] Birmingham 21 [Suburb: Large (outside principal city, in urbanized area with population of 250,000 or more)] 44 4407 Social Work. 5 [Master's Degree] Master's Degree 8 13 17 49278 NA NA NA NA NA NA 18530 610 18530 610 1000 9850 5498 19140 19140 35488 35488
## 5 001047 102368 Troy University 1 [Public] 18 [Master's Colleges & Universities: Larger Programs] AL 5 [Southeast (AL, AR, FL, GA, KY, LA, MS, NC, SC, TN, VA, WV)] Troy 33 [Town: Remote (in urban cluster more than 35 miles from an urbanized area)] 44 4407 Social Work. 5 [Master's Degree] Master's Degree 106 86 145 30072 23141 238 104 39609 20 38289 7146 802 14292 802 1129 5815 5227 7948 15094 20119 27265
## 6 011462 102553 University of Alaska Anchorage 1 [Public] 18 [Master's Colleges & Universities: Larger Programs] AK 8 [Far West (AK, CA, HI, NV, OR, WA)] Anchorage 11 [City: Large (population of 250,000 or more)] 44 4407 Social Work. 5 [Master's Degree] Master's Degree 20 19 20 30364 26831 275 24 50606 27 51750 9768 1424 19954 1758 1608 11728 7568 11192 21712 32096 42616
# Function to get sampling distribution (default: 1000 samples of size 200)
get_sampling_distribution <- function(data_vec, num_samples = 1000, sample_size = 200) {
sample_means <- vector(mode = 'numeric', num_samples)
for (i in 1:length(sample_means)) {
samp <- sample(data_vec, sample_size)
sample_means[[i]] <- mean(samp, na.rm = T)
}
sample_means
}
# Function to generate plot
plot_distribution <- function(data_vec1, data_vec2 = NULL, data_df = NULL, data_var = NULL, group_var = NULL, group_cat = NULL, pop_labels = NULL, show_group_hist = F, sampling_dist = F, plot_title = '') {
two_pop <- !is.null(group_var) || !is.null(data_vec2)
two_dist <- two_pop && !sampling_dist
# Prep dataframe
if (!is.null(data_df)) {
data_df[[data_var]] <- unclass(data_df[[data_var]]) # unclass haven_labelled
if (two_pop) {
data_df[[group_var]] <- unclass(data_df[[group_var]])
}
data_df <- data_df %>% filter(!is.na(get(data_var))) # remove NA rows
# If group_cat not provided, use 2 values from group_var
if (two_pop && is.null(group_cat)) {
group_cat <- sort(unique(na.omit(data_df[[group_var]])))[1:2]
}
# Create population vector(s)
if (!two_pop) { # single population
data_vec1 <- data_df[[data_var]]
} else { # two populations
data_vec1 <- (data_df %>% filter(get(group_var) == group_cat[[1]]))[[data_var]]
data_vec2 <- (data_df %>% filter(get(group_var) == group_cat[[2]]))[[data_var]]
}
}
# Get sampling distribution
if (sampling_dist) {
if (!two_pop) { # single population
data_vec1 <- get_sampling_distribution(data_vec1)
} else { # two populations
data_vec1_samp <- get_sampling_distribution(data_vec1)
data_vec2_samp <- get_sampling_distribution(data_vec2)
data_vec1 <- data_vec1_samp - data_vec2_samp
}
}
# Legend text
legend_text <- c(paste('Mean:', round(mean(data_vec1), 2),
'\nStd Dev:', round(sd(data_vec1), 2)),
paste('Median:', round(median(data_vec1), 2)))
if (two_dist) {
legend_text <- c(legend_text,
paste('Mean:', round(mean(data_vec2), 2),
'\nStd Dev:', round(sd(data_vec2), 2)),
paste('Median:', round(median(data_vec2), 2)))
}
if (!is.null(pop_labels)) {
pop1 <- pop_labels[[1]]
pop2 <- pop_labels[[2]]
} else if (!is.null(group_var)) {
pop1 <- str_c(group_var, '=', group_cat[[1]])
pop2 <- str_c(group_var, '=', group_cat[[2]])
} else {
pop1 <- 'Pop1'
pop2 <- 'Pop2'
}
# Create statistics dataframe
if (!two_dist) {
lines_vec <- c('dotted')
stats_vec <- c(mean(data_vec1), median(data_vec1))
legend_title <- 'Statistics'
if (two_pop && sampling_dist) {
legend_title <- str_c('Statistics\n(', pop1, ' - ', pop2, ')')
}
} else {
lines_vec <- c('dotted', 'dotdash')
stats_vec <- c(mean(data_vec1), median(data_vec1), mean(data_vec2), median(data_vec2))
legend_title <- str_c('Statistics\n(', pop1, ' vs. ', pop2, ')')
}
stats_df <- data.frame(
pop = rep(lines_vec, each = 2),
stat = rep(c('blue', 'red'), times = if_else(two_dist, 2, 1)),
val = stats_vec
)
stats_df$pop <- factor(stats_df$pop, levels = c('dotted', 'dotdash'))
# Plot distribution(s)
p <- ggplot() +
ggtitle(plot_title) + xlab('') + ylab('') +
geom_density(aes(x = data_vec1), alpha = 0.8)
if (!two_dist || show_group_hist) { # show histogram only if 1 pop or show_group_hist is TRUE
p <- p +
geom_histogram(aes(x = data_vec1, y = ..density..), alpha = 0.4, position = 'identity')
}
if (two_dist) {
p <- p +
geom_density(aes(x = data_vec2), alpha = 0.8)
if (show_group_hist) { # show histogram only if show_group_hist is TRUE
p <- p +
geom_histogram(aes(x = data_vec2, y = ..density..), alpha = 0.4, position = 'identity', fill = 'wheat4')
}
}
p <- p +
geom_vline(data = stats_df,
aes(xintercept = val, color = interaction(stat, pop), linetype = interaction(stat, pop)),
size = 0.6, alpha = 0.8) +
scale_color_manual(name = legend_title,
labels = legend_text,
values = as.character(stats_df$stat)) +
scale_linetype_manual(name = legend_title,
labels = legend_text,
values = as.character(stats_df$pop)) +
theme(plot.title = element_text(size = 10, face = 'bold', hjust = 0.5),
legend.title = element_text(size = 9, face = 'bold'),
legend.text = element_text(size = 8)) +
guides(col = guide_legend(ncol = if_else(two_dist, 2, 1)))
p
}# IPEDS data: tuitfee_grad_res
plot_distribution(df_ipeds_pop$tuitfee_grad_res)# IPEDS data: tuitfee_grad_res
plot_distribution(data_df = df_ipeds_pop, data_var = 'tuitfee_grad_res')# IPEDS data: tuitfee_grad_res by control (default: use first 2 categorical values in group_var)
plot_distribution(data_df = df_ipeds_pop, data_var = 'tuitfee_grad_res', group_var = 'control')# IPEDS data: tuitfee_grad_res by control, shade histogram
plot_distribution(data_df = df_ipeds_pop, data_var = 'tuitfee_grad_res', group_var = 'control',
show_group_hist = T)# IPEDS data: tuitfee_grad_res by control, custom order
plot_distribution(data_df = df_ipeds_pop, data_var = 'tuitfee_grad_res', group_var = 'control',
group_cat = c(2, 1))# IPEDS data: tuitfee_grad_res by control, custom categories
plot_distribution(data_df = df_ipeds_pop, data_var = 'tuitfee_grad_res', group_var = 'control',
group_cat = c(2, 3))# Alternatively, prepare 2 populations first to pass in
pop1 <- (df_ipeds_pop %>% filter(unclass(control) == 1))$tuitfee_grad_res
pop2 <- (df_ipeds_pop %>% filter(unclass(control) == 2))$tuitfee_grad_res
plot_distribution(pop1, pop2)# Custom labels
plot_distribution(pop1, pop2, pop_labels = c('Public', 'Private not-for-profit'))plot_distribution(data_df = df_ipeds_pop, data_var = 'tuitfee_grad_res', group_var = 'control',
group_cat = c(1, 2), pop_labels = c('Public', 'Private not-for-profit'))# Take single random sample of size 200
set.seed(124)
df_generated_sample <- df_generated_pop[sample(nrow(df_generated_pop), 200), ]
set.seed(124)
df_ipeds_sample <- df_ipeds_pop[sample(nrow(df_ipeds_pop), 200), ]
# Randomly generated data: standard normal
plot_distribution(df_generated_sample$stdnorm_dist)# IPEDS data: tuitfee_grad_res
plot_distribution(df_ipeds_pop$tuitfee_grad_res, sampling_dist = T)# IPEDS data: tuitfee_grad_res
plot_distribution(data_df = df_ipeds_pop, data_var = 'tuitfee_grad_res', sampling_dist = T) # sampling_dist = T runs the get_sampling_distribution function and plots the sampling distributionplot_distribution(data_df = df_ipeds_pop, data_var = 'tuitfee_grad_res', sampling_dist = F) # sampling_dist = F plots underlying variable# IPEDS data: tuitfee_grad_res by control (control=1 minus control=2)
plot_distribution(data_df = df_ipeds_pop, data_var = 'tuitfee_grad_res', group_var = 'control',
sampling_dist = T, pop_labels = c('Public', 'Private'))# IPEDS data: tuitfee_grad_res by control (control=2 minus control=1)
plot_distribution(data_df = df_ipeds_pop, data_var = 'tuitfee_grad_res', group_var = 'control',
group_cat = c(2, 1), sampling_dist = T, pop_labels = c('Private', 'Public'))# Function to generate t-distribution plot
plot_t_distribution <- function(data_vec1, data_vec2 = NULL, data_df = NULL, data_var = NULL, group_var = NULL, group_cat = NULL, mu = 0, alpha = 0.05, alternative = 'two.sided', plot_title = '', shade_rejection = T, shade_pval = T, stacked = F, beta_y = NULL, beta_x = NULL, beta_x_cat = 1) {
two_pop <- !is.null(group_var) || !is.null(data_vec2)
# Prep dataframe
if (!is.null(data_df) && is.null(beta_y)) {
data_df[[data_var]] <- unclass(data_df[[data_var]]) # unclass haven_labelled
if (two_pop) {
data_df[[group_var]] <- unclass(data_df[[group_var]])
}
data_df <- data_df %>% filter(!is.na(get(data_var))) # remove NA rows
# If group_cat not provided, use 2 values from group_var
if (two_pop && is.null(group_cat)) {
group_cat <- sort(unique(na.omit(data_df[[group_var]])))[1:2]
}
# Create samples
if (!two_pop) { # single sample
data_vec1 <- data_df[[data_var]]
} else { # two samples
data_vec1 <- (data_df %>% filter(get(group_var) == group_cat[[1]]))[[data_var]]
data_vec2 <- (data_df %>% filter(get(group_var) == group_cat[[2]]))[[data_var]]
}
}
# Calculate stats
if (!is.null(beta_y)) { # beta testing
mod <- summary(lm(formula = get(beta_y) ~ get(beta_x), data = data_df))
idx <- beta_x_cat + 1
deg_freedom <- mod$df[[2]]
std_err <- mod$coefficients[[idx, 'Std. Error']]
t <- mod$coefficients[[idx, 't value']]
} else if (!two_pop) { # single sample
data_vec1 <- na.omit(data_vec1)
# Calculate t-statistics
sample_size <- length(data_vec1)
deg_freedom <- sample_size - 1
xbar <- mean(data_vec1)
s <- sd(data_vec1)
std_err <- s / sqrt(sample_size)
t <- (xbar - mu) / std_err
} else { # two samples
data_vec1 <- na.omit(data_vec1)
data_vec2 <- na.omit(data_vec2)
# Calculate t-statistics
xbar1 <- mean(data_vec1)
xbar2 <- mean(data_vec2)
s1 <- sd(data_vec1)
s2 <- sd(data_vec2)
n1 <- length(data_vec1)
n2 <- length(data_vec2)
deg_freedom <- (s1**2/n1 + s2**2/n2)**2 / ((s1**2/n1)**2/(n1-1) + (s2**2/n2)**2/(n2-1))
std_err <- sqrt(s1**2/n1 + s2**2/n2)
t <- (xbar1 - xbar2) / std_err
}
# Calculate critical value and p-value
if (alternative == 'less') { # left-tailed
cv_lower <- qt(p = alpha, df = deg_freedom, lower.tail = T)
cv_legend <- round(cv_lower, 2)
cv_legend2 <- round(cv_lower * std_err + mu, 2)
pval <- round(pt(q = t, df = deg_freedom, lower.tail = T), 4)
} else if (alternative == 'greater') { # right-tailed
cv_upper <- qt(p = alpha, df = deg_freedom, lower.tail = F)
cv_legend <- round(cv_upper, 2)
cv_legend2 <- round(cv_upper * std_err + mu, 2)
pval <- round(pt(q = t, df = deg_freedom, lower.tail = F), 4)
} else { # two-tailed
cv_lower <- qt(p = alpha / 2, df = deg_freedom, lower.tail = T)
cv_upper <- qt(p = alpha / 2, df = deg_freedom, lower.tail = F)
cv_legend <- str_c('\u00B1', round(cv_upper, 2))
cv_legend2 <- str_c(round(cv_lower * std_err + mu, 2), ' & ', round(cv_upper * std_err + mu, 2))
pval_half <- round(pt(q = t, df = deg_freedom, lower.tail = t < 0), 4)
pval <- str_c(pval_half, ' + ', pval_half, ' = ', 2 * pval_half)
}
# Plot t-distribution
p <- ggplot(data.frame(x = -c(-4, 4)), aes(x)) +
ggtitle(plot_title) + xlab('') + ylab('') +
stat_function(fun = dt, args = list(df = deg_freedom), xlim = c(-4, 4))
# Shade rejection region using critical value
if (alternative != 'greater') {
p <- p + geom_vline(aes(xintercept = cv_lower, color = 'cval'),
linetype = 'dotted', size = 0.8, alpha = 0.8)
if (shade_rejection) {
p <- p + stat_function(fun = dt, args = list(df = deg_freedom),
xlim = c(-4, cv_lower),
geom = 'area', alpha = 0.3, fill = 'red')
}
if (shade_pval) {
p <- p + stat_function(fun = dt, args = list(df = deg_freedom),
xlim = c(-4, if_else(alternative == 'two.sided', -abs(t), t)),
geom = 'area', alpha = 0.3, fill = 'blue')
}
}
if (alternative != 'less') {
p <- p + geom_vline(aes(xintercept = cv_upper, color = 'cval'),
linetype = 'dotted', size = 0.8, alpha = 0.8)
if (shade_rejection) {
p <- p + stat_function(fun = dt, args = list(df = deg_freedom),
xlim = c(cv_upper, 4),
geom = 'area', alpha = 0.3, fill = 'red')
}
if (shade_pval) {
p <- p + stat_function(fun = dt, args = list(df = deg_freedom),
xlim = c(if_else(alternative == 'two.sided', abs(t), t), 4),
geom = 'area', alpha = 0.3, fill = 'blue')
}
}
# Legend text
legend_text <- c('t-statistics / p-value', 'critical value / alpha')
if (stacked) {
legend_text <- c(str_c('t-statistics: ', round(t, 2),
'\n(p-value: ', str_extract(pval, '[\\d.-]+$'), ')'),
str_c('Critical value: ', cv_legend,
'\n(alpha: ', round(alpha, 2), ')'))
}
stats_text <- c(str_c('t-statistics: ', round(t, 2)),
str_c('SE: ', round(std_err, 2)),
str_c('p-val: ', pval),
str_c('Critical value: ', cv_legend),
str_c('alpha: ', round(alpha, 2)))
if (!stacked) {
p <- p +
annotate('text', size = 9*5/14, x = 4.84, y = 0.14, hjust = 0,
label = 'bold(Statistics)', parse = T) +
annotate('text', size = 8*5/14, x = 4.89, y = 0:4 * -0.015 + 0.12, hjust = 0,
label = stats_text)
}
# Label plot
p <- p +
geom_vline(aes(xintercept = t, color = 'tstat'),
linetype = 'dotted', size = 0.8, alpha = 0.8) +
scale_x_continuous(sec.axis = sec_axis(trans = ~ . * std_err + mu)) +
scale_color_manual(name = if_else(stacked, 'Statistics', 'Legend'),
breaks = c('tstat', 'cval'),
labels = legend_text,
values = c(tstat = 'blue', cval = 'red')) +
theme(plot.title = element_text(size = 10, face = 'bold', hjust = 0.5),
plot.margin = unit(c(5.5, if_else(stacked, 5.5, 30), 5.5, 5.5), 'pt'),
legend.title = element_text(size = 9, face = 'bold'),
legend.text = element_text(size = 8)) +
coord_cartesian(xlim = c(-4, 4),
clip = 'off')
p
}# H0: population mean of coa_grad_res is $29,000
# Ha: population mean of coa_grad_res is NOT $29,000 (two-sided test)
# Verdict: Fail to reject H0
t.test(df_ipeds_sample$coa_grad_res, mu = 29000)##
## One Sample t-test
##
## data: df_ipeds_sample$coa_grad_res
## t = -1.7293, df = 183, p-value = 0.08545
## alternative hypothesis: true mean is not equal to 29000
## 95 percent confidence interval:
## 26313.28 29176.89
## sample estimates:
## mean of x
## 27745.08
plot_t_distribution(df_ipeds_sample$coa_grad_res, mu = 29000)# H0: population mean of coa_grad_res is $32,000
# Ha: population mean of coa_grad_res is NOT $32,000 (two-sided test)
# Verdict: Reject H0
t.test(df_ipeds_sample$coa_grad_res, mu = 32000)##
## One Sample t-test
##
## data: df_ipeds_sample$coa_grad_res
## t = -5.8632, df = 183, p-value = 0.0000000208
## alternative hypothesis: true mean is not equal to 32000
## 95 percent confidence interval:
## 26313.28 29176.89
## sample estimates:
## mean of x
## 27745.08
plot_t_distribution(data_df = df_ipeds_sample, data_var = 'coa_grad_res', mu = 32000)# H0: population mean of coa_grad_res is $33,000
# Ha: population mean of coa_grad_res is LESS THAN $33,000 (left-sided test)
# Verdict: Reject H0
t.test(df_ipeds_sample$coa_grad_res, mu = 33000, alternative = 'less')##
## One Sample t-test
##
## data: df_ipeds_sample$coa_grad_res
## t = -7.2412, df = 183, p-value = 0.000000000005962
## alternative hypothesis: true mean is less than 33000
## 95 percent confidence interval:
## -Inf 28944.82
## sample estimates:
## mean of x
## 27745.08
plot_t_distribution(df_ipeds_sample$coa_grad_res, mu = 33000, alternative = 'less')# H0: population mean of coa_grad_res is $31,000
# Ha: population mean of coa_grad_res is GREATER THAN $31,000 (right-sided test)
# Verdict: Fail to reject H0
t.test(df_ipeds_sample$coa_grad_res, mu = 31000, alternative = 'greater')##
## One Sample t-test
##
## data: df_ipeds_sample$coa_grad_res
## t = -4.4852, df = 183, p-value = 1
## alternative hypothesis: true mean is greater than 31000
## 95 percent confidence interval:
## 26545.35 Inf
## sample estimates:
## mean of x
## 27745.08
plot_t_distribution(df_ipeds_sample$coa_grad_res, mu = 31000, alternative = 'greater')# H0: population mean of tuitfee_grad_res is equal for control=1 and control=2
# Ha: population means are NOT equal
# Verdict: Reject H0
t.test(formula = tuitfee_grad_res ~ control, data = df_ipeds_sample,
subset = unclass(control) %in% c(1, 2))##
## Welch Two Sample t-test
##
## data: tuitfee_grad_res by control
## t = -7.1793, df = 111.77, p-value = 0.0000000000823
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -11070.409 -6281.462
## sample estimates:
## mean in group 1 mean in group 2
## 9270.575 17946.510
# Here, the t-statistics line is off the grid
plot_t_distribution(data_df = df_ipeds_sample, data_var = 'tuitfee_grad_res',
group_var = 'control', group_cat = c(1, 2))# Alternatively, prepare 2 samples first to pass in
sample1 <- (df_ipeds_sample %>% filter(unclass(control) == 1))$tuitfee_grad_res
sample2 <- (df_ipeds_sample %>% filter(unclass(control) == 2))$tuitfee_grad_res
t.test(sample1, sample2)##
## Welch Two Sample t-test
##
## data: sample1 and sample2
## t = -7.1793, df = 111.77, p-value = 0.0000000000823
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -11070.409 -6281.462
## sample estimates:
## mean of x mean of y
## 9270.575 17946.510
plot_t_distribution(sample1, sample2)# compare locale = tuitfee_grad_nres between public and private
plot_t_distribution(data_df = df_ipeds_sample, data_var = 'coa_grad_nres',
group_var = 'control', group_cat = c(1, 2))# compare locale = city-large (11) to suburb-large (21)
plot_t_distribution(data_df = df_ipeds_sample, data_var = 'roomboard_off',
group_var = 'locale', group_cat = c(11, 21))summary(lm(formula = debt_all_stgp_eval_mean ~ coa_grad_res, data = df_socialwork))##
## Call:
## lm(formula = debt_all_stgp_eval_mean ~ coa_grad_res, data = df_socialwork)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25799 -5991 -926 5357 42453
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13445.93950 1830.24898 7.347 0.00000000000285 ***
## coa_grad_res 0.96869 0.06069 15.962 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9435 on 251 degrees of freedom
## Multiple R-squared: 0.5038, Adjusted R-squared: 0.5018
## F-statistic: 254.8 on 1 and 251 DF, p-value: < 0.00000000000000022
plot_t_distribution(beta_y = 'debt_all_stgp_eval_mean', beta_x = 'coa_grad_res', data_df = df_socialwork)# 2 categories
summary(lm(formula = earn_mdn_hi_2yr ~ control, data = df_mba_fac))##
## Call:
## lm(formula = earn_mdn_hi_2yr ~ control, data = df_mba_fac)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38111 -13099 -4780 8208 119148
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 72104 1227 58.771 <0.0000000000000002 ***
## controlPrivate not-for-profit -1688 1683 -1.003 0.316
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20640 on 602 degrees of freedom
## (61 observations deleted due to missingness)
## Multiple R-squared: 0.001668, Adjusted R-squared: 9.24e-06
## F-statistic: 1.006 on 1 and 602 DF, p-value: 0.3164
plot_t_distribution(beta_y = 'earn_mdn_hi_2yr', beta_x = 'control', data_df = df_mba_fac)# More than 2 categories
summary(lm(formula = earn_mdn_hi_2yr ~ urban, data = df_mba_fac))##
## Call:
## lm(formula = earn_mdn_hi_2yr ~ urban, data = df_mba_fac)
##
## Residuals:
## Min 1Q Median 3Q Max
## -44625 -12659 -3672 8778 118233
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 77303 1555 49.700 < 0.0000000000000002 ***
## urbanmed/small city -5972 2143 -2.787 0.00549 **
## urbansuburb -7049 2253 -3.128 0.00184 **
## urbantown/rural -15148 2550 -5.942 0.00000000479 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20100 on 600 degrees of freedom
## (61 observations deleted due to missingness)
## Multiple R-squared: 0.05629, Adjusted R-squared: 0.05157
## F-statistic: 11.93 on 3 and 600 DF, p-value: 0.0000001349
# Default would plot first comparison category (med/small city)
plot_t_distribution(beta_y = 'earn_mdn_hi_2yr', beta_x = 'urban', data_df = df_mba_fac)plot_t_distribution(beta_y = 'earn_mdn_hi_2yr', beta_x = 'urban', data_df = df_mba_fac, beta_x_cat = 1) # equivalent# Specify second comparison category
plot_t_distribution(beta_y = 'earn_mdn_hi_2yr', beta_x = 'urban', data_df = df_mba_fac, beta_x_cat = 2)# Specify third comparison category
plot_t_distribution(beta_y = 'earn_mdn_hi_2yr', beta_x = 'urban', data_df = df_mba_fac, beta_x_cat = 3)library(patchwork)
# Use / to place plots 1 per row (stacked)
plot_distribution(df_generated_pop$norm_dist, plot_title = 'Population distribution') /
plot_distribution(df_generated_sample$norm_dist, plot_title = 'Single sample distribution') /
plot_distribution(df_generated_pop$norm_dist, sampling_dist = T,
plot_title = 'Sampling distribution')# Or use + and plot_layout()
plot_distribution(data_df = df_ipeds_pop, data_var = 'tuitfee_grad_res', group_var = 'control',
group_cat = c(2, 1), plot_title = 'Population distributions') +
plot_distribution(data_df = df_ipeds_sample, data_var = 'tuitfee_grad_res', group_var = 'control',
group_cat = c(2, 1), plot_title = 'Single sample distributions') +
plot_distribution(data_df = df_ipeds_pop, data_var = 'tuitfee_grad_res', group_var = 'control',
group_cat = c(2, 1), sampling_dist = T,
plot_title = 'Sampling distribution differences') +
plot_layout(ncol = 1)# When including t-distribution plot, need to specify stacked = T
plot_distribution(df_ipeds_sample$coa_grad_res, plot_title = 'Population distribution') +
plot_distribution(df_ipeds_sample$coa_grad_res, plot_title = 'Single sample distribution') +
plot_t_distribution(df_ipeds_sample$coa_grad_res, mu = 29000, stacked = T,
plot_title = 'Sampling distribution under null') +
plot_layout(ncol = 1)